In [1]:
import numpy as np
import pandas as pd

Diagram of plant

Clean and Preprocess data

In [2]:
weather_file, gen_file = 'solar_panel_data\\Plant_2_Weather_Sensor_Data.csv', 'solar_panel_data\\Plant_2_Generation_Data.csv'
In [3]:
gen_df = pd.read_csv(gen_file, parse_dates=[0]).dropna()
weather_df = pd.read_csv(weather_file, parse_dates=[0]).dropna()
In [4]:
# Both datasets are time-series, so we can merge them together for easier manipulation
source_df = gen_df.merge(weather_df, on=["DATE_TIME", "PLANT_ID", "SOURCE_KEY"], how="outer").sort_values("DATE_TIME")
source_df
Out[4]:
DATE_TIME PLANT_ID SOURCE_KEY DC_POWER AC_POWER DAILY_YIELD TOTAL_YIELD AMBIENT_TEMPERATURE MODULE_TEMPERATURE IRRADIATION
0 2020-05-15 00:00:00 4136001 4UPUqMRk7TRMgml 0.0 0.0 9425.000000 2.429011e+06 NaN NaN NaN
21 2020-05-15 00:00:00 4136001 xoJJ8DcxJEcupym 0.0 0.0 0.000000 2.091436e+08 NaN NaN NaN
20 2020-05-15 00:00:00 4136001 xMbIugepa2P7lBB 0.0 0.0 9166.000000 1.066566e+08 NaN NaN NaN
19 2020-05-15 00:00:00 4136001 vOuJvMaM2sgwLmb 0.0 0.0 0.000000 2.211962e+06 NaN NaN NaN
18 2020-05-15 00:00:00 4136001 rrq4fwE8jgrTyWY 0.0 0.0 280.214286 1.209641e+08 NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ...
67689 2020-06-17 23:45:00 4136001 WcxssY2VbP4hApt 0.0 0.0 4331.000000 1.819119e+08 NaN NaN NaN
67690 2020-06-17 23:45:00 4136001 mqwcsP2rE7J0TFp 0.0 0.0 4238.000000 5.938150e+08 NaN NaN NaN
67691 2020-06-17 23:45:00 4136001 oZ35aAeoifZaQzV 0.0 0.0 4467.000000 1.660189e+09 NaN NaN NaN
67684 2020-06-17 23:45:00 4136001 NgDl19wMapZy17u 0.0 0.0 4239.000000 1.117460e+08 NaN NaN NaN
70956 2020-06-17 23:45:00 4136001 iq8k7ZNt4Mwm3w0 NaN NaN NaN NaN 23.202871 22.535908 0.0

70957 rows × 10 columns

In [5]:
# Rename verbose sensor names
source_ids = source_df["SOURCE_KEY"].drop_duplicates().reset_index()["SOURCE_KEY"].reset_index()\
    .rename(columns={'index': 'source_id'})

# Based on the description of the data, 'DAILY_YIELD' and 'TOTAL_YIELD' are just power integrated over time.  
# These are not needed because we already have power.
features = ['DC_POWER', 'AC_POWER', 'AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'IRRADIATION']

# Pivot for easier analysis
piv_source_df = source_df.merge(source_ids, on="SOURCE_KEY")\
    .pivot_table(index="DATE_TIME", columns=['source_id'], values=features)\
    .reset_index()\
    .sort_values('DATE_TIME')
In [6]:
# Flatten column names
piv_source_df.columns = [f"{str(col[1])}{col[0]}" for col in piv_source_df.columns.values]

piv_source_df["dt"] = (piv_source_df["DATE_TIME"] - piv_source_df["DATE_TIME"].shift(1)).apply(lambda x: x.total_seconds())
piv_source_df.dropna().groupby(["dt"]).size()  # There are samples missing
Out[6]:
dt
900.0     2349
1800.0       5
dtype: int64
In [7]:
# Interpolate over missing values

# Generate desired timestamps.  Data is already recorded at 15 minutes, so 15 minute periods work naturally
resampled_ts = pd.DataFrame(pd.date_range(start=piv_source_df["DATE_TIME"].min(), 
              end=piv_source_df["DATE_TIME"].max(), 
              freq="15min",
              name="DATE_TIME"))

# Will not interpolate over a period longer than 5*15min (==1:15:00) 
limit = 5
clean_df = piv_source_df.merge(resampled_ts, on="DATE_TIME", how="outer")\
    .interpolate(limit=limit)\
    .sort_values("DATE_TIME")\
    .reset_index()

# Timesteps are now always 15 minutes
clean_df["dt"] = (clean_df["DATE_TIME"] - clean_df["DATE_TIME"].shift(1)).apply(lambda x: x.total_seconds())
print(clean_df["dt"].drop_duplicates())
del clean_df["dt"]

# Can now set DATE_TIME as index
clean_df.set_index("DATE_TIME", inplace=True)
0      NaN
1    900.0
Name: dt, dtype: float64
In [8]:
# Look for areas where dimensionality reduction could be applied
correlation =  clean_df.corr()
high_corr = set()
for col in correlation.columns:
    for i, row in correlation.iterrows():
        if row[col] >= 0.9 and i != col:
            high_corr.add((tuple(sorted([i, col])), row[col]))
            
# We can see that this high-dimensional data could be reduced to a few dimensions
high_corr
Out[8]:
{(('0AC_POWER', '0DC_POWER'), 0.9999969431706845),
 (('0AC_POWER', '12AC_POWER'), 0.9614621258613769),
 (('0AC_POWER', '12DC_POWER'), 0.9613841919999251),
 (('0AC_POWER', '19AC_POWER'), 0.9282636380917713),
 (('0AC_POWER', '19DC_POWER'), 0.9281530273605476),
 (('0DC_POWER', '12AC_POWER'), 0.9614655887316773),
 (('0DC_POWER', '12DC_POWER'), 0.9613936228754041),
 (('0DC_POWER', '19AC_POWER'), 0.9282518055964978),
 (('0DC_POWER', '19DC_POWER'), 0.9281472248139893),
 (('10AC_POWER', '10DC_POWER'), 0.9999969457111311),
 (('11IRRADIATION', '11MODULE_TEMPERATURE'), 0.9469229529323987),
 (('11IRRADIATION', '16AC_POWER'), 0.9507592757450594),
 (('11IRRADIATION', '16DC_POWER'), 0.9508464576586836),
 (('11IRRADIATION', '20AC_POWER'), 0.9250673630919551),
 (('11IRRADIATION', '20DC_POWER'), 0.9251626894916435),
 (('11IRRADIATION', '2AC_POWER'), 0.952610454538705),
 (('11IRRADIATION', '2DC_POWER'), 0.9527003636012068),
 (('11IRRADIATION', '8AC_POWER'), 0.9305390675727165),
 (('11IRRADIATION', '8DC_POWER'), 0.9306335172468831),
 (('11MODULE_TEMPERATURE', '16AC_POWER'), 0.9124418660901152),
 (('11MODULE_TEMPERATURE', '16DC_POWER'), 0.9124687779613277),
 (('11MODULE_TEMPERATURE', '2AC_POWER'), 0.9127269239953465),
 (('11MODULE_TEMPERATURE', '2DC_POWER'), 0.9127479598123996),
 (('12AC_POWER', '12DC_POWER'), 0.9999969628427217),
 (('12AC_POWER', '19AC_POWER'), 0.9220988309137652),
 (('12AC_POWER', '19DC_POWER'), 0.9219964192542323),
 (('12DC_POWER', '19AC_POWER'), 0.9220170644634816),
 (('12DC_POWER', '19DC_POWER'), 0.9219205696386841),
 (('13AC_POWER', '13DC_POWER'), 0.9999968057881787),
 (('14AC_POWER', '14DC_POWER'), 0.9999967546736677),
 (('15AC_POWER', '15DC_POWER'), 0.9999973806888441),
 (('16AC_POWER', '16DC_POWER'), 0.9999967894147517),
 (('16AC_POWER', '19AC_POWER'), 0.9169491206942647),
 (('16AC_POWER', '19DC_POWER'), 0.9169854745973899),
 (('16AC_POWER', '20AC_POWER'), 0.9681194202649867),
 (('16AC_POWER', '20DC_POWER'), 0.9681331399727502),
 (('16AC_POWER', '2AC_POWER'), 0.9978876231717535),
 (('16AC_POWER', '2DC_POWER'), 0.9978734838375194),
 (('16AC_POWER', '8AC_POWER'), 0.9498202922484332),
 (('16AC_POWER', '8DC_POWER'), 0.9498213503308431),
 (('16DC_POWER', '19AC_POWER'), 0.9167421599717254),
 (('16DC_POWER', '19DC_POWER'), 0.9167846802742118),
 (('16DC_POWER', '20AC_POWER'), 0.9680593636389825),
 (('16DC_POWER', '20DC_POWER'), 0.9680794463579422),
 (('16DC_POWER', '2AC_POWER'), 0.9978813470691659),
 (('16DC_POWER', '2DC_POWER'), 0.9978736705162896),
 (('16DC_POWER', '8AC_POWER'), 0.9497369353516213),
 (('16DC_POWER', '8DC_POWER'), 0.9497441935124854),
 (('17AC_POWER', '17DC_POWER'), 0.9999979421948683),
 (('18AC_POWER', '18DC_POWER'), 0.9999968095897261),
 (('19AC_POWER', '19DC_POWER'), 0.9999968282458143),
 (('19AC_POWER', '20AC_POWER'), 0.900688904795358),
 (('19AC_POWER', '20DC_POWER'), 0.9005055811759867),
 (('19AC_POWER', '2AC_POWER'), 0.9161938413574564),
 (('19AC_POWER', '2DC_POWER'), 0.9159774198539586),
 (('19AC_POWER', '7AC_POWER'), 0.9418808324888726),
 (('19AC_POWER', '7DC_POWER'), 0.9418549035596224),
 (('19DC_POWER', '20AC_POWER'), 0.9006760510104297),
 (('19DC_POWER', '20DC_POWER'), 0.9004988625497657),
 (('19DC_POWER', '2AC_POWER'), 0.9162343222174334),
 (('19DC_POWER', '2DC_POWER'), 0.9160241747886086),
 (('19DC_POWER', '7AC_POWER'), 0.9417980598169906),
 (('19DC_POWER', '7DC_POWER'), 0.9417782870159102),
 (('1AC_POWER', '1DC_POWER'), 0.9999935082106991),
 (('20AC_POWER', '20DC_POWER'), 0.9999967487669542),
 (('20AC_POWER', '2AC_POWER'), 0.9643594933188062),
 (('20AC_POWER', '2DC_POWER'), 0.9642828015364787),
 (('20AC_POWER', '8AC_POWER'), 0.9822486486301677),
 (('20AC_POWER', '8DC_POWER'), 0.982232082620228),
 (('20DC_POWER', '2AC_POWER'), 0.9643665858530749),
 (('20DC_POWER', '2DC_POWER'), 0.9642963117812751),
 (('20DC_POWER', '8AC_POWER'), 0.9822228481026805),
 (('20DC_POWER', '8DC_POWER'), 0.9822126096507492),
 (('21AC_POWER', '21DC_POWER'), 0.9999968560371637),
 (('22AC_POWER', '22DC_POWER'), 0.9999971075813989),
 (('2AC_POWER', '2DC_POWER'), 0.999996686047773),
 (('2AC_POWER', '8AC_POWER'), 0.9461562496419815),
 (('2AC_POWER', '8DC_POWER'), 0.946148756113955),
 (('2DC_POWER', '8AC_POWER'), 0.9460587355808855),
 (('2DC_POWER', '8DC_POWER'), 0.9460574737942109),
 (('3AC_POWER', '3DC_POWER'), 0.9999968582163485),
 (('4AC_POWER', '4DC_POWER'), 0.9999970233806014),
 (('5AC_POWER', '5DC_POWER'), 0.999997047881089),
 (('6AC_POWER', '6DC_POWER'), 0.9999969251147974),
 (('7AC_POWER', '7DC_POWER'), 0.9999968754477893),
 (('8AC_POWER', '8DC_POWER'), 0.9999968404471669),
 (('9AC_POWER', '9DC_POWER'), 0.9999967353955947)}
In [9]:
# Use the average of the AC_POWER as the solar panel  power output metric
ac_power_cols = [col for col in clean_df.columns if "AC_POWER" in col]
clean_df["avg_ac_power"] = clean_df[ac_power_cols].mean(axis=1)

# Use the average of the DC_POWER as the inverter power output metric
dc_power_cols = [col for col in clean_df.columns if "DC_POWER" in col]
clean_df["avg_dc_power"] = clean_df[dc_power_cols].mean(axis=1)
clean_df["avg_dc_power"].plot()
Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x1ce04196e08>
In [10]:
# Remove AC power columns, as they are redundant with DC power, except DC power is affected by inverter efficiency
# Now have a reduced data set for faster analysis
df = clean_df.drop(columns=(ac_power_cols + dc_power_cols + ["index"]))

df.columns = [col.replace('11', '').lower() for col in df.columns]

Question 1: How does ambient temperature and solar irradiation affect solar panel power output?

In [12]:
import plotly.express as px
# The plot shows that irradiation is a much better predictor of solar panel output than ambient temperature
# It also shows that there are many outliers, where power output is much lower than typical
fig = px.scatter_3d(df, x='irradiation', y='ambient_temperature', z='avg_ac_power')
fig.show()
In [39]:
df.plot(x='irradiation', y ='avg_ac_power' )
df.plot(x='ambient_temperature', y ='avg_ac_power' )
Out[39]:
<matplotlib.axes._subplots.AxesSubplot at 0x1ce0a20d548>
In [95]:
# Use a regression model
# The time-dependent effects can be ignored if the time delay of the effect of temperature and irradiation is << 15 minutes (the sample rate)
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

# Using Huber regression to lessen the effect of the outliers
from sklearn.linear_model import HuberRegressor
features = ['irradiation']
target =  ['avg_ac_power']
X = df [features]
y = df [target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

regressor = HuberRegressor()
regressor.fit(X_train, y_train)

y_prediction = regressor.predict(X_test)

RMSE = sqrt(mean_squared_error(y_true = y_test, y_pred = y_prediction))

predict_df = y_test.copy()
predict_df['predict'] = y_prediction
predict_df.plot()

# Performance using linear regression is good enough 
RMSE
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py:760: DataConversionWarning:

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().

Out[95]:
116.14174181587525

Question 2: How does ambient temperature affect solar panel efficiency?

In [ ]:
 
In [138]:
df["panel_eff_metric"] = df["avg_ac_power"]/df["irradiation"]

# Drop NaNs due to 0 irradiation
q2_df = df[df["irradiation"] != 0]

q2_df.plot(x='irradiation', y ='panel_eff_metric')
Out[138]:
<matplotlib.axes._subplots.AxesSubplot at 0x1ce12d3c188>
In [144]:
# Using polynomial fit
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

features = ['ambient_temperature']
target =  ['panel_eff_metric']
X = q2_df[features]
y = q2_df[target]

scale = StandardScaler()
X_scaled = scale.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.33)

poly_reg = PolynomialFeatures(degree=3)
X_poly = poly_reg.fit_transform(X_train)
poly_reg.fit(X_poly,y_train)


regressor = LinearRegression()
regressor.fit(X_poly, y_train)

y_prediction = regressor.predict(poly_reg.fit_transform(X_test))

RMSE = sqrt(mean_squared_error(y_true = y_test, y_pred = y_prediction))

predict_df = y_test.copy()
predict_df['predict'] = y_prediction
predict_df.plot()

predict_df.plot(x='')

# Performance using linear regression is poor
RMSE
Out[144]:
407.21978199897734
In [ ]:
# Age of the inverter/solar panel would be an important feature for determining degradation over time
# Module temperature could also influence inverter efficiency
df["age"] = df.index - df.index[0]
df["age"] = df["age"].apply(lambda x: x.total_seconds())
df["inv_eff_metric"] = df["avg_dc_power"]/df["avg_ac_power"]

Questions:

  • How does irradiation affect power output?
  • Does module temperature affect inverter efficiency?
  • Does module age influence inverter efficiency?
  • Does solar panel age affect solar panel efficiency?